library(tidyverse)
library(plotly)
library(readxl)
library(lubridate)
library(sp) #for analyzing spatial data
library(sf) #for analyzing spatial data
library(rgdal) #for importing zips shapefile and transform CRS
library(leaflet) #for interactive maps
library(tigris) #geojoin
library(htmlwidgets) #save interactive maps

knitr::opts_chunk$set(
  fig.width = 10,
  fig.asp = .2,
  out.width = "90%",
  dpi = 300,
  warning = FALSE,
  message = FALSE
)

Define our dataframe crime_df_map.

#read crime_df data
crime_df = read_csv("data/crime_df.csv") %>%
  janitor::clean_names() %>%
  rename(case_number = x1)

#define crime_df_map for our map
crime_df_map = drop_na(crime_df, latitude)

Simple map

Get a first look of our dataframe by making a simple map draft. Drag the slider below to see spatial distribution of crime incidents in different time period during a day.

crime_plot =
  crime_df %>% 
  filter(!is.na(borough)) %>%
  plot_ly(
    lat = ~latitude, 
    lon = ~longitude, 
    frame = ~hour_of_day,
    type = "scattermapbox", 
    mode = "markers", 
    alpha = 0.2,
    color = ~borough) %>% 
  layout(
    title = "Spatial distribution of crime incidents",
    mapbox = list(
      style = 'carto-positron',
      zoom = 9,
      center = list(lon = -73.9, lat = 40.7)),
    legend = list(title=list(text='<b> Borough </b>'))
    ) %>%
  animation_slider(
    currentvalue = list(prefix = "Hour of day: ", font = list(color="black"))
  )
crime_plot

Crime rate interactive map categorized by modified zipcodes

We want to first draw a crime rate map to examine the crime rate in each zip code region. Crime rate for a zip code region is defined as number of crime incidents occurred in the zip code region during 2016-01-01 and 2017-12-31, divided by population size in the corresponding zip code region.

Data preparation

Import zips shapefile and transform CRS

zips = readOGR("data/cb_2015_us_zcta510_500k/cb_2015_us_zcta510_500k.shp")
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\30535\OneDrive\Documents\Columbia courses\8105\crimeweather_final_project.github.io\data\cb_2015_us_zcta510_500k\cb_2015_us_zcta510_500k.shp", layer: "cb_2015_us_zcta510_500k"
## with 33144 features
## It has 5 fields
## Integer64 fields read as strings:  ALAND10 AWATER10
zips = spTransform(zips, CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))

Match each crime incident to a zip code region by utilizing their coordinates variables longitude and latitude. Include a zip variable to crime_df_map data frame, representing corresponding zip codes of each crime incident.

#extract only lon and lat
crime_lat_long = select(crime_df_map, longitude, latitude)

#transform coordinates into a SpatialPointsDataFrame
crime_spdf = SpatialPointsDataFrame(coords = crime_lat_long, data = crime_lat_long, proj4string = CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))

#subset only the zipcodes in which points are found
crime_zips = zips[crime_spdf, ]

#redefine crime_df_map to include zipcodes
crime_df_map = cbind(crime_df_map, over(crime_spdf, crime_zips[,"ZCTA5CE10"]))
crime_df_map = rename(crime_df_map, zip = ZCTA5CE10)

300 crime incidents can’t be matched to a zip code, so we drop them.

crime_df_map =
  crime_df_map %>%
  drop_na(zip)

In order to make meaningful data analysis, we want zip codes to represent regions with similar population. However, some zip code regions are very small, among which there are even some buildings that have their own zip codes. To deal with the challenges of zip codes, we convert zip codes to modified zip codes to account for discrepancies in population size. Modified zip codes represent regions with similar population size. In order to make the conversion, we use a conversion table ZCTA-to-MODZCTA.csv obtained from here.

Read the conversion table ZCTA-to-MODZCTA.csv.

zcta_conv = read_csv("data/ZCTA-to-MODZCTA.csv")
zcta_conv$ZCTA = as.character(zcta_conv$ZCTA)
zcta_conv$MODZCTA = as.character(zcta_conv$MODZCTA)

Match zip in crime_df_map with corresponding modified zip code, represented by a new variable mod_zip.

crime_df_map = crime_df_map %>%
  left_join(rename(zcta_conv, zip = ZCTA), by = "zip") %>%
  rename(mod_zip = MODZCTA)

0 crime incidents can’t be matched to a modified zip code and leave NA values in the mod_zip column. This is because the 0 crime incidents happened in zip code 10550, which is not in New York City. Therefore, we drop the 0 observations.

crime_df_map = drop_na(crime_df_map, mod_zip)

In order to make an interactive map based on modified zip codes, we use modzcta shapefile obtained from here. Note this shapefile is based on modified zip codes instead of standard zip codes.

modzcta = st_read("data/MODZCTA_2010/MODZCTA_2010.shp") %>%
  janitor::clean_names()
## Reading layer `MODZCTA_2010' from data source 
##   `C:\Users\30535\OneDrive\Documents\Columbia courses\8105\crimeweather_final_project.github.io\data\MODZCTA_2010\MODZCTA_2010.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 178 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913176 ymin: 120122 xmax: 1067382 ymax: 272844
## Projected CRS: Lambert_Conformal_Conic

Now, include shapefile information into crime_df_map.

crime_df_geo =
  geo_join(modzcta, crime_df_map, "modzcta", "mod_zip", how = "inner") %>%
  rename(mod_zip = modzcta)

We can then obtain number of crimes in each zip region, sorting in descending order.

number_of_crimes = 
  crime_df_geo %>% 
  group_by(mod_zip) %>% 
  summarize(number_of_crimes = n()) %>%
  arrange(desc(number_of_crimes))

In order to obtain crime rate, we get population of each zip code region from here.

pop_zip = read_csv("data/csvData.csv") %>%
  janitor::clean_names()
pop_zip$zip = as.character(pop_zip$zip)

#filter out regions in New York City
pop_zip = pop_zip %>%
  filter(county %in% c("New York", "Kings", "Queens", "Bronx", "Richmond"))

The dataset pop_zip lacks population data in zip code region 10065, so we manually add it in (data source here).

pop_zip[nrow(pop_zip) + 1,] = list("10065","    
New York City", "New York", 30339)

We then convert zip code to modified zip code, and get the population in each modified zip code region.

pop_mod_zip = pop_zip %>%
  left_join(rename(zcta_conv, zip = ZCTA), by = "zip") %>%
  rename(mod_zip = MODZCTA) %>%
  group_by(mod_zip) %>%
  summarize(population = sum(population))

Add population of each modified zip code region into number_of_crimes, and calculate crime rate in each modified zip code region. Crime rates are sorted in descending order.

crime_rate = left_join(number_of_crimes, pop_mod_zip, by = "mod_zip")
crime_rate$crime_rate = crime_rate$number_of_crimes / crime_rate$population * 100
crime_rate = 
  crime_rate %>%
  select(mod_zip, crime_rate) %>%
  group_by(crime_rate) %>%
  arrange(desc(crime_rate))

Make interactive map of crime rate.

#label
labels = sprintf(
  "<strong>%s</strong><br/>Crime rate is %g&percnt; during 2016-01-01 and 2017-12-31.", crime_rate$mod_zip, crime_rate$crime_rate) %>%
  lapply(htmltools::HTML)

#color palette
pal = colorBin(palette = "OrRd", 9, domain = crime_rate$crime_rate)

map_interactive = crime_rate %>%
  st_transform(crs = "+init=epsg:4326") %>%
  leaflet() %>%
  addProviderTiles(provider = "CartoDB.Positron") %>%
  addPolygons(label = labels,
              stroke = FALSE,
              smoothFactor = 0.5,
              opacity = 1,
              fillOpacity = 0.7,
              fillColor = ~ pal(crime_rate),
              highlightOptions = highlightOptions(weight = 5,
                                                  fillOpacity = 1,
                                                  color = "black",
                                                  opacity = 1,
                                                  bringToFront = TRUE)) %>%
  addLegend("bottomright",
            pal = pal,
            values = ~ crime_rate,
            title = "Crime cases per 100 residents during the period",
            opacity = 0.7) %>%
  addTiles("Interactive map of crime rate")
saveWidget(map_interactive, "nyc_crime_rate_map.html")
map_interactive

Crime data Shiny map categorized by modified zip codes and 4 weeks interval

crime_month data preparation

We cut date into 4 weeks interval, and want to visualize crime counts and crime rate in each modified zip code region for every 4 weeks since 2016-01-01 by using R Shiny.

crime_month = crime_df_map %>% #use crime_df_map instead of crime_df_geo here because crime_df_geo is too large 
  group_by(mod_zip, date) %>%
  summarize(crime_counts = n())
crime_month$date = cut(crime_month$date, "28 days")
crime_month =  crime_month %>%
  group_by(mod_zip, date) %>%
  summarize(crime_counts = sum(crime_counts)) %>%
  rename(month_following = date) %>%
  filter(month_following != "2017-12-29") #observations for 2017-12-29 removed for too few data

#add back mod_zip, crime_rate, and geometry
crime_month = left_join(crime_month, pop_mod_zip, by = "mod_zip") %>%
  select(mod_zip, population, month_following, crime_counts)
crime_month$crime_rate = crime_month$crime_counts / crime_month$population
crime_month = crime_month %>%
  select(month_following, mod_zip, population, crime_counts, crime_rate) %>%
  arrange(month_following)
crime_month = geo_join(modzcta, crime_month, "modzcta", "mod_zip", how = "inner")
crime_month$month_following = as.Date(crime_month$month_following, format = "%Y-%m-%d")
#saveRDS(crime_month, "r_shiny_map/crime_month.RDS") #save for R Shiny

Shiny app

Put a link here.
Put a link here.
Put a link here.
Put a link here.
Put a link here.
Put a link here.
Put a link here.
Put a link here.
Put a link here.
Put a link here.